Homework 2 - Clustering Dataset

This notebook shuffles a clustering dataset and splits it into training and testing sets for practicing the K-Means and k-NN algorithms.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Location of the raw clustering data
filename = 'clustering/dataset.csv'

# Read the CSV, shuffle every row (frac=1) with a fixed seed so the order is
# repeatable, then renumber the rows 0..n-1.
# reset_index() without drop=True keeps the pre-shuffle row labels as a
# regular column named 'index' (visible in the preview below).
# NOTE(review): that column is also written to the train/test CSVs by the
# export cell; pass drop=True here if it is not wanted downstream.
dataset = (
    pd.read_csv(filename)
    .sample(frac=1, random_state=32)
    .reset_index()
)

# Preview the first rows of the shuffled frame
dataset.head()


Out[2]:
index x1 x2 y
0 1123 24.8697 14.6393 12
1 2554 23.3448 16.4908 26
2 2362 26.6545 22.6042 24
3 687 21.8222 27.1835 7
4 18 25.8840 6.3294 1

In [3]:
# Split Into Training & Testing Sets (33% of the rows are held out for testing).
# random_state is pinned (same seed as the shuffle above) so that the split,
# and therefore the CSV files written below, are identical on every
# Restart & Run All instead of changing from run to run.
train, test = train_test_split(dataset, test_size=0.33, random_state=32)

# Write to Text Data
# index=False omits the DataFrame's row labels from the files; the regular
# columns (including the 'index' column created by reset_index) are kept.
train.to_csv('clustering/train_data.csv', index=False)
test.to_csv('clustering/test_data.csv', index=False)

In [4]:
# Summarize the training split: row count, column dtypes and non-null counts.
train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2077 entries, 444 to 1262
Data columns (total 4 columns):
index    2077 non-null int64
x1       2077 non-null float64
x2       2077 non-null float64
y        2077 non-null int64
dtypes: float64(2), int64(2)
memory usage: 81.1 KB

In [5]:
# Summarize the testing split: row count, column dtypes and non-null counts.
test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1023 entries, 1036 to 988
Data columns (total 4 columns):
index    1023 non-null int64
x1       1023 non-null float64
x2       1023 non-null float64
y        1023 non-null int64
dtypes: float64(2), int64(2)
memory usage: 40.0 KB